import jieba
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# Load data
def loadArticle(fileName):
    '''
    Read the test articles from the raw, self-collected dataset.
    :param fileName: path to the corpus file
    :return: list of cleaned article strings, one per line of the file
    '''
    # Strip markup and whitespace from every line
    with open(fileName, encoding='utf-8') as file:
        test_article = []
        # Read line by line
        for line in file:
            # Remove <content> tags, ASCII and full-width spaces, and the trailing newline
            line = line.replace("<content>", "")
            line = line.replace("</content>", "")
            line = line.replace(" ", "")
            line = line.replace("　", "")
            line = line.strip()
            test_article.append(line)
    return test_article
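
# A minimal usage sketch for loadArticle. Assumption: the corpus file puts each
# document on its own line wrapped in <content>...</content> tags, which is what
# the tag stripping above implies. 'cluster/corpus.txt' is a hypothetical path.
#
#   docs = loadArticle('cluster/corpus.txt')
#   print(len(docs), docs[0][:30])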


def test(file_path, n_clusters):
    '''
    Tokenize the corpus, cluster it with agglomerative clustering, write each
    cluster to its own txt file, and return the top keywords per cluster.
    :param file_path: path to the corpus file
    :param n_clusters: number of clusters to produce
    :return: newline-separated keyword summary, one line per cluster
    '''
    # Load the stop-word list
    stopwords = set()
    with open('cluster/stopwords.txt', 'r', encoding='utf-8') as f:
        for line in f:
            stopwords.add(line.strip())
    # Load the text data
    texts = loadArticle(file_path)
    # Tokenize each text, dropping stop words and single-character tokens
    words_list = []
    for text in texts:
        words = list(jieba.cut(text))
        words = [word for word in words if word not in stopwords and len(word) > 1]
        words_list.append(' '.join(words))

    # Vectorize the tokenized text with TF-IDF
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(words_list)
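    # Optional sanity check, a sketch: report the document-term matrix shape and a
    # few learned terms so the vectorization step can be eyeballed before clustering.
    # get_feature_names_out requires scikit-learn >= 1.0.
    print('doc-term matrix: {} docs x {} terms'.format(X.shape[0], X.shape[1]))
    print('sample terms:', list(vectorizer.get_feature_names_out()[:10]))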

    # Cluster with hierarchical (agglomerative) clustering using Ward linkage
    cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster.fit(X.toarray())
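
    # Optional internal-validity check, a sketch using the metrics imported above:
    # higher is better for silhouette and Calinski-Harabasz, lower for Davies-Bouldin.
    # All three are only defined for 2 <= n_clusters < n_samples.
    X_dense = X.toarray()
    if 1 < cluster.n_clusters_ < X_dense.shape[0]:
        print('silhouette: {:.3f}'.format(silhouette_score(X_dense, cluster.labels_)))
        print('calinski_harabasz: {:.1f}'.format(calinski_harabasz_score(X_dense, cluster.labels_)))
        print('davies_bouldin: {:.3f}'.format(davies_bouldin_score(X_dense, cluster.labels_)))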

    # Write the texts of each cluster to a separate txt file
    for i in range(cluster.n_clusters_):
        with open('cluster/Agglomerative_Clustering/cluster_{}.txt'.format(i), 'w', encoding='utf-8') as f:
            for j in range(len(texts)):
                if cluster.labels_[j] == i:
                    f.write('{}\n'.format(texts[j]))
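
    # Optional 2-D map of the clustering, a sketch built on the TSNE and matplotlib
    # imports above. The output path mirrors the cluster files written above and is
    # an assumption; t-SNE requires perplexity < n_samples.
    if X.shape[0] > 2:
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, X.shape[0] - 1))
        embedded = tsne.fit_transform(X.toarray())
        plt.scatter(embedded[:, 0], embedded[:, 1], c=cluster.labels_, cmap='tab10', s=10)
        plt.title('Agglomerative clustering, t-SNE projection')
        plt.savefig('cluster/Agglomerative_Clustering/tsne.png')
        plt.close()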
    out_top_words = ""
    # Summarize the top keywords for each cluster
    for i in range(cluster.n_clusters_):
        # Re-read the texts of cluster i from the file written above
        cluster_texts = []
        with open('cluster/Agglomerative_Clustering/cluster_{}.txt'.format(i), 'r', encoding='utf-8') as f:
            for line in f:
                cluster_texts.append(line.strip())
        # Re-tokenize with the same stop-word and length filtering as before
        cluster_words = []
        for text in cluster_texts:
            words = list(jieba.cut(text))
            words = [word for word in words if word not in stopwords and len(word) > 1]
            cluster_words.extend(words)
        # Count word frequencies and keep the ten most common words as keywords
        word_count = Counter(cluster_words)
        top_words = [word for word, _ in word_count.most_common(10)]
        summary = 'Agglomerative_Clustering/Cluster {}: {}'.format(i, ', '.join(top_words))
        print(summary)
        out_top_words += summary + "\n"
    return out_top_words
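

if __name__ == '__main__':
    # A minimal usage sketch; the corpus path and cluster count are illustrative
    # assumptions, since the original script does not show how test() is invoked.
    print(test('cluster/corpus.txt', n_clusters=5))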
